library(readr)
library(MVA)
## Loading required package: HSAUR2
## Loading required package: tools
library(HSAUR2)
library(SciViews)
library(scatterplot3d)
library(car)
## Loading required package: carData
library(lattice)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(ggridges)
library(ggvis)
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
library(ggthemes)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
library(gapminder)
library(gganimate)
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
##
## Attaching package: 'gganimate'
## The following object is masked from 'package:ggvis':
##
## view_static
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ ggvis::resolution() masks ggplot2::resolution()
## ✖ purrr::some() masks car::some()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
library(Hotelling)
## Loading required package: corpcor
##
## Attaching package: 'Hotelling'
##
## The following object is masked from 'package:dplyr':
##
## summarise
library(stats)
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## ---
## biotools version 4.2
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(ggfortify)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## The following object is masked from 'package:car':
##
## logit
library(corrplot)
## corrplot 0.92 loaded
library(devtools)
## Loading required package: usethis
library(cluster)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(NbClust)
library(MASS)
library(gvlma)
library(leaps)
library(relaimpo)
## Loading required package: boot
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:lattice':
##
## melanoma
##
## The following object is masked from 'package:car':
##
## logit
##
## Loading required package: survey
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## The following object is masked from 'package:ggvis':
##
## band
##
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:boot':
##
## aml
##
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
## Loading required package: mitools
## This is the global version of package relaimpo.
##
## If you are a non-US user, a version with the interesting additional metric pmvd is available
##
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(memisc)
##
## Attaching package: 'memisc'
##
## The following object is masked from 'package:Matrix':
##
## as.array
##
## The following object is masked from 'package:magrittr':
##
## %$%
##
## The following objects are masked from 'package:lubridate':
##
## as.interval, is.interval
##
## The following object is masked from 'package:purrr':
##
## %@%
##
## The following object is masked from 'package:tibble':
##
## view
##
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename, syms
##
## The following object is masked from 'package:ggplot2':
##
## syms
##
## The following object is masked from 'package:car':
##
## recode
##
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
##
## The following object is masked from 'package:base':
##
## as.array
library(ROCR)
library(klaR)
# --- Data import -------------------------------------------------------
# College data: 30 schools x 19 variables; the first CSV column holds the
# school names and is used as row names.
clg <- read.csv("/Users/ajayvishnu/Desktop/RUTGERS/Spring_2023/Multivariate Analysis/Datasets/College_Data_cleaned.csv", row.names = 1)
# NOTE(review): attach() is generally discouraged; later code references
# columns as clg$... explicitly, so this is likely unnecessary.
attach(clg)
str(clg)
## 'data.frame': 30 obs. of 19 variables:
## $ Private : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Apps : int 15712 4269 8579 48094 1879 19315 15698 5785 5095 14752 ...
## $ Accept : int 11719 2594 5561 26330 1216 10344 10775 2690 4491 9572 ...
## $ Acceptance_70: int 1 0 0 0 0 0 0 0 1 0 ...
## $ Enroll : int 4277 985 3681 4520 483 3450 2478 499 2400 5329 ...
## $ Top10perc : int 29 27 25 36 27 48 85 26 27 48 ...
## $ Top25perc : int 53 57 50 79 62 93 100 62 53 85 ...
## $ F.Undergrad : int 18511 6476 17880 21401 3311 28938 12677 4005 13894 30017 ...
## $ P.Undergrad : int 604 2592 1673 3712 1646 2025 864 1886 8374 5189 ...
## $ Outstate : int 10260 8594 6994 7410 8832 10645 12024 7410 6857 5130 ...
## $ Room.Board : int 3176 4408 3384 4748 5376 4060 5302 4748 3975 3309 ...
## $ Books : int 740 494 700 690 700 512 790 690 858 650 ...
## $ Personal : int 2200 2768 2681 2009 1850 2394 1818 2009 3093 3140 ...
## $ PhD : int 85 82 88 90 92 77 96 90 89 91 ...
## $ Terminal : int 89 88 94 95 98 96 96 95 93 99 ...
## $ S.F.Ratio : num 13.8 18.4 13.7 19.5 13.5 18.1 16.1 17.4 12.8 19.7 ...
## $ perc.alumni : int 20 6 17 19 19 19 11 16 9 11 ...
## $ Expend : int 8944 7618 9657 10474 12529 8992 15934 11878 9275 7837 ...
## $ Grad.Rate : int 73 55 57 77 72 63 66 58 37 65 ...
# Correlation heat map of all variables (upper triangle only).
corrplot(cor(clg), type = "upper", method = "color")
###PCA
# PCA on the numeric columns (column 1 = Private flag is dropped),
# variables standardized before extraction.
clg_pca <- prcomp(clg[,-1],scale=TRUE)
# Scree plot: percentage of variance explained per component.
fviz_eig(clg_pca, addlabels = TRUE)
# Variable correlation circle, coloured by quality of representation (cos2).
fviz_pca_var(clg_pca,col.var = "cos2",
gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
repel = TRUE)
# Same decomposition via FactoMineR for the individuals map and biplot.
res.pca <- PCA(clg[,-1], graph = FALSE)
fviz_pca_ind(res.pca)
fviz_pca_biplot(res.pca, repel = TRUE,
col.var = "#FC4E07",
)
# --- Clustering preparation -------------------------------------------
# Standardized Euclidean distance matrix and standardized data matrix.
clg_dist <- get_dist(clg[,-1], stand = TRUE, method = "euclidean")
matstd_clg <- scale(clg[,-1])
#scaled
# Gap statistic to suggest a number of clusters for k-means. This call uses
# factoextra's fviz_nbclust; a local copy is defined just below and masks it
# for subsequent calls.
fviz_nbclust(matstd_clg, kmeans, method = "gap_stat")
# Local copy of factoextra::fviz_nbclust (masks the package version) so the
# NbClust() result computed below can be visualized with fviz_nbclust().
#
# x          - numeric matrix/data.frame, or the list returned by NbClust()
#              (recognized by its "Best.nc" element)
# FUNcluster - clustering function (kmeans, pam, hcut, clara, ...); required
#              unless x is an NbClust() result
# method     - criterion to plot: "silhouette", "wss" or "gap_stat"
# diss       - optional precomputed dissimilarity (stats::dist object)
# k.max      - maximum number of clusters to examine (must be >= 2)
# nboot      - bootstrap replicates for the gap statistic
# Returns a ggplot object (or prints the NbClust vote summary).
#
# Fixes vs. the pasted original: corrected the typo in the k.max error
# message ("must bet > = 2"), used `<-` for assignment, and replaced
# `class(x) == "..."` comparisons with inherits().
fviz_nbclust <- function (x, FUNcluster = NULL, method = c("silhouette", "wss",
    "gap_stat"), diss = NULL, k.max = 10, nboot = 100, verbose = interactive(),
    barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue",
    print.summary = TRUE, ...)
{
    set.seed(123)  # reproducible bootstrap samples / random starts
    if (k.max < 2)
        stop("k.max must be >= 2")
    method <- match.arg(method)
    if (!inherits(x, c("data.frame", "matrix")) & !("Best.nc" %in%
        names(x)))
        stop("x should be an object of class matrix/data.frame or ",
            "an object created by the function NbClust() [NbClust package].")
    if (inherits(x, "list") & "Best.nc" %in% names(x)) {
        # x is an NbClust() result: print (single index) or plot (matrix of
        # votes) the proposed numbers of clusters.
        best_nc <- x$Best.nc
        if (inherits(best_nc, "numeric"))
            print(best_nc)
        else if (inherits(best_nc, "matrix"))
            .viz_NbClust(x, print.summary, barfill, barcolor)
    }
    else if (is.null(FUNcluster))
        stop("The argument FUNcluster is required. ", "Possible values are kmeans, pam, hcut, clara, ...")
    else if (!is.function(FUNcluster)) {
        stop("The argument FUNcluster should be a function. ",
            "Check if you're not overriding the specified function name somewhere.")
    }
    else if (method %in% c("silhouette", "wss")) {
        # Elbow / silhouette curves: cluster for each k and record the
        # criterion value.
        if (is.data.frame(x))
            x <- as.matrix(x)
        if (is.null(diss))
            diss <- stats::dist(x)
        v <- rep(0, k.max)
        if (method == "silhouette") {
            # The silhouette is undefined for k = 1, so start at k = 2.
            for (i in 2:k.max) {
                clust <- FUNcluster(x, i, ...)
                v[i] <- .get_ave_sil_width(diss, clust$cluster)
            }
        }
        else if (method == "wss") {
            for (i in 1:k.max) {
                clust <- FUNcluster(x, i, ...)
                v[i] <- .get_withinSS(diss, clust$cluster)
            }
        }
        df <- data.frame(clusters = as.factor(1:k.max), y = v,
            stringsAsFactors = TRUE)
        ylab <- "Total Within Sum of Square"
        if (method == "silhouette")
            ylab <- "Average silhouette width"
        p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
            color = linecolor, ylab = ylab, xlab = "Number of clusters k",
            main = "Optimal number of clusters")
        if (method == "silhouette")
            p <- p + geom_vline(xintercept = which.max(v), linetype = 2,
                color = linecolor)
        return(p)
    }
    else if (method == "gap_stat") {
        # Gap statistic via cluster::clusGap with nboot bootstrap samples.
        extra_args <- list(...)
        gap_stat <- cluster::clusGap(x, FUNcluster, K.max = k.max,
            B = nboot, verbose = verbose, ...)
        if (!is.null(extra_args$maxSE))
            maxSE <- extra_args$maxSE
        else maxSE <- list(method = "firstSEmax", SE.factor = 1)
        p <- fviz_gap_stat(gap_stat, linecolor = linecolor,
            maxSE = maxSE)
        return(p)
    }
}
# Helper for fviz_nbclust(): visualize an NbClust() result as a bar chart of
# how many indices proposed each number of clusters, optionally printing a
# text summary first.
#
# Bug fix vs. the pasted original: `ss` was computed only inside
# `if (print.summary)` but used unconditionally to build the plot data, so
# calling with print.summary = FALSE failed with "object 'ss' not found".
# The vote summary is now computed before the conditional.
.viz_NbClust <- function (x, print.summary = TRUE, barfill = "steelblue",
    barcolor = "steelblue")
{
    best_nc <- x$Best.nc
    if (inherits(best_nc, "numeric"))
        print(best_nc)
    else if (inherits(best_nc, "matrix")) {
        best_nc <- as.data.frame(t(best_nc), stringsAsFactors = TRUE)
        best_nc$Number_clusters <- as.factor(best_nc$Number_clusters)
        # Votes per proposed k -- needed for both the summary and the plot.
        ss <- summary(best_nc$Number_clusters)
        if (print.summary) {
            cat("Among all indices: \n===================\n")
            for (i in 1:length(ss)) {
                cat("*", ss[i], "proposed ", names(ss)[i],
                  "as the best number of clusters\n")
            }
            cat("\nConclusion\n=========================\n")
            cat("* According to the majority rule, the best number of clusters is ",
                names(which.max(ss)), ".\n\n")
        }
        df <- data.frame(Number_clusters = names(ss), freq = ss,
            stringsAsFactors = TRUE)
        p <- ggpubr::ggbarplot(df, x = "Number_clusters",
            y = "freq", fill = barfill, color = barcolor) +
            labs(x = "Number of clusters k", y = "Frequency among all indices",
                title = paste0("Optimal number of clusters - k = ",
                  names(which.max(ss))))
        return(p)
    }
}
# Run all 30 NbClust validity indices (complete linkage, Euclidean) on the
# standardized data to vote on the best number of clusters (2..10).
res.nbclust <- clg[,-1] %>% scale() %>% NbClust(distance = "euclidean", min.nc = 2, max.nc = 10, method = "complete", index ="all")
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 6 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Visualize the NbClust votes with the locally defined fviz_nbclust() above;
# the majority rule points to k = 3.
fviz_nbclust(res.nbclust, ggtheme = theme_minimal())
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 6 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 6 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .
set.seed(123)
# K-means with k = 3 (chosen by NbClust majority rule above); 25 random
# starts to avoid poor local optima.
km.res <- kmeans(matstd_clg, 3, nstart = 25)
# Visualize the k-means partition on the first two principal components.
fviz_cluster(km.res, data = matstd_clg,
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal())
# PAM (k-medoids) with the same k, for comparison with k-means.
pam.res <- pam(matstd_clg, 3)
fviz_cluster(pam.res)
# Hierarchical clustering with Ward linkage on Euclidean distances.
# Note: matstd_clg is already standardized, so the extra scale() in the
# original pipeline was redundant and has been dropped (it re-centered
# columns that already had mean 0 and sd 1).
res.hc <- matstd_clg %>% dist(method = "euclidean") %>%
hclust(method = "ward.D2")
# Dendrogram cut into three groups (one color per group; the original
# passed four colors for k = 3, which triggered a length warning).
fviz_dend(res.hc, k = 3,
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# --- Factor analysis ---------------------------------------------------
# Principal components factoring with 5 factors and varimax rotation
# (psych::principal) on the numeric columns.
fit.pc <- principal(clg[,-1], nfactors=5, rotate="varimax")
# Rotated component scores for each school.
fit.pc$scores
## RC1 RC3 RC4
## Virginia Tech 1.485468071 0.21131139 -0.21998370
## Maryland, Baltimore County -0.481006334 0.76030425 -0.31548733
## University of Kansas 0.177350813 1.40427674 0.02317481
## Rutgers, New Brunswick 3.133346443 -0.54184707 0.10004650
## New Jersey Institute of Technology -0.698565381 0.57433793 0.29395115
## Penn State 1.263617108 0.06342282 0.84411017
## University of California, Irvine 0.283893077 0.27706042 2.08221597
## Rutgers , Newark -0.677051736 0.74194350 0.27258849
## University of Utah -0.356647575 3.34936020 -1.18308501
## University of Texas, Austin 1.271969226 1.44186502 0.74986789
## SUNY College at Geneseo 0.008424103 -1.52037815 1.98538731
## University of Wisconsin at Madison 1.516380130 0.26320285 -0.09742845
## St. Mary's College of Maryland -1.124429195 0.14350357 1.14223565
## Washington State University 0.185200681 0.95130029 0.31005570
## Lincoln University -1.161856724 0.18943920 -0.60705986
## Montclair State University -0.377018822 -0.01890837 -0.20013527
## University of Massachusetts, Amherst 1.576944277 -0.15158429 -1.40007992
## Mount Vernon Nazarene College -0.364193850 -1.23343747 -0.82760015
## Carnegie Mellon University -0.224847119 -0.25735884 1.07996340
## Campbellsville College -1.032269951 -0.40964800 0.33895063
## Hillsdale College -0.206749033 -1.13226255 0.14398325
## Austin College -0.377970174 -0.66656689 0.56118337
## Fresno Pacific College -1.061532078 -0.09982983 0.80092587
## Roger Williams University 0.162940212 -1.47084476 -2.46329998
## Point Park College -0.114030259 -0.81216742 -1.83109281
## Bethany College -0.819570894 0.05679587 -0.65195750
## Caldwell College -0.859520115 -0.02187551 -0.34922095
## Southern California College -0.792310218 -0.03837150 -0.97211902
## University of Evansville 0.021108340 -1.20027859 0.28912388
## Cornell College -0.357073020 -0.85276482 0.10078590
## RC2 RC5
## Virginia Tech 0.93933366 -1.37146513
## Maryland, Baltimore County -1.04433771 0.69037664
## University of Kansas 0.02120010 -0.49041703
## Rutgers, New Brunswick -1.04843438 0.72216022
## New Jersey Institute of Technology -0.05933922 1.65202697
## Penn State -0.32614786 -0.30181199
## University of California, Irvine -0.18008499 1.01341401
## Rutgers , Newark -0.74251293 1.09254223
## University of Utah 0.52779424 -0.09370849
## University of Texas, Austin -1.02061303 -0.72998530
## SUNY College at Geneseo -1.37428272 -0.58696470
## University of Wisconsin at Madison 1.12649462 -0.17115117
## St. Mary's College of Maryland -0.13265786 0.40476349
## Washington State University 0.67877810 -0.88696370
## Lincoln University -0.60445947 -0.01578063
## Montclair State University -2.23511230 0.88218839
## University of Massachusetts, Amherst 0.24209556 0.12073221
## Mount Vernon Nazarene College -1.01965290 -1.24747394
## Carnegie Mellon University 2.18335679 2.38731938
## Campbellsville College -1.37511652 -1.29394799
## Hillsdale College 1.25164830 0.35262799
## Austin College 1.23320379 0.05805328
## Fresno Pacific College 0.70563964 -1.79219884
## Roger Williams University -0.17658194 1.05281187
## Point Park College 0.04786513 0.68789793
## Bethany College 0.98875049 -1.46132669
## Caldwell College -0.19445128 1.06430250
## Southern California College -0.33594369 -0.65912083
## University of Evansville 0.43124829 -0.68883908
## Cornell College 1.49232006 -0.39006159
# Parallel analysis (scree vs. resampled data) to suggest how many
# factors/components to retain.
fa.parallel(clg[,-1])
## Parallel analysis suggests that the number of factors = 2 and the number of components = 2
# Path diagram: which variables load on which rotated component.
fa.diagram(fit.pc)
# Very Simple Structure criterion across factor counts.
summary(vss(clg[,-1]))
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
##
## Very Simple Structure
## VSS complexity 1 achieves a maximimum of 0.68 with 2 factors
## VSS complexity 2 achieves a maximimum of 0.85 with 2 factors
##
## The Velicer MAP criterion achieves a minimum of 0.11 with 4 factors
##
str(clg)
## 'data.frame': 30 obs. of 19 variables:
## $ Private : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Apps : int 15712 4269 8579 48094 1879 19315 15698 5785 5095 14752 ...
## $ Accept : int 11719 2594 5561 26330 1216 10344 10775 2690 4491 9572 ...
## $ Acceptance_70: int 1 0 0 0 0 0 0 0 1 0 ...
## $ Enroll : int 4277 985 3681 4520 483 3450 2478 499 2400 5329 ...
## $ Top10perc : int 29 27 25 36 27 48 85 26 27 48 ...
## $ Top25perc : int 53 57 50 79 62 93 100 62 53 85 ...
## $ F.Undergrad : int 18511 6476 17880 21401 3311 28938 12677 4005 13894 30017 ...
## $ P.Undergrad : int 604 2592 1673 3712 1646 2025 864 1886 8374 5189 ...
## $ Outstate : int 10260 8594 6994 7410 8832 10645 12024 7410 6857 5130 ...
## $ Room.Board : int 3176 4408 3384 4748 5376 4060 5302 4748 3975 3309 ...
## $ Books : int 740 494 700 690 700 512 790 690 858 650 ...
## $ Personal : int 2200 2768 2681 2009 1850 2394 1818 2009 3093 3140 ...
## $ PhD : int 85 82 88 90 92 77 96 90 89 91 ...
## $ Terminal : int 89 88 94 95 98 96 96 95 93 99 ...
## $ S.F.Ratio : num 13.8 18.4 13.7 19.5 13.5 18.1 16.1 17.4 12.8 19.7 ...
## $ perc.alumni : int 20 6 17 19 19 19 11 16 9 11 ...
## $ Expend : int 8944 7618 9657 10474 12529 8992 15934 11878 9275 7837 ...
## $ Grad.Rate : int 73 55 57 77 72 63 66 58 37 65 ...
# Recode the 0/1 indicator columns into labelled factors for modelling and
# plotting. Both recodes now use the same ifelse() pattern; the original
# recoded Private with chained subset-assignment
# (clg[clg$Private == 0, ]$Private <- "Public"), which relied on implicit
# int -> character coercion of the whole column. The resulting factor
# (levels "Private" < "Public", alphabetical) is identical.
clg$Private <- ifelse(test=clg$Private == 0, yes="Public", no="Private")
clg$Private <- as.factor(clg$Private)
clg$Acceptance_70 <- ifelse(test=clg$Acceptance_70 == 0, yes="Acceptance <= 70%", no="Acceptance > 70%")
clg$Acceptance_70 <- as.factor(clg$Acceptance_70 )
str(clg)
## 'data.frame': 30 obs. of 19 variables:
## $ Private : Factor w/ 2 levels "Private","Public": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : int 15712 4269 8579 48094 1879 19315 15698 5785 5095 14752 ...
## $ Accept : int 11719 2594 5561 26330 1216 10344 10775 2690 4491 9572 ...
## $ Acceptance_70: Factor w/ 2 levels "Acceptance <= 70%",..: 2 1 1 1 1 1 1 1 2 1 ...
## $ Enroll : int 4277 985 3681 4520 483 3450 2478 499 2400 5329 ...
## $ Top10perc : int 29 27 25 36 27 48 85 26 27 48 ...
## $ Top25perc : int 53 57 50 79 62 93 100 62 53 85 ...
## $ F.Undergrad : int 18511 6476 17880 21401 3311 28938 12677 4005 13894 30017 ...
## $ P.Undergrad : int 604 2592 1673 3712 1646 2025 864 1886 8374 5189 ...
## $ Outstate : int 10260 8594 6994 7410 8832 10645 12024 7410 6857 5130 ...
## $ Room.Board : int 3176 4408 3384 4748 5376 4060 5302 4748 3975 3309 ...
## $ Books : int 740 494 700 690 700 512 790 690 858 650 ...
## $ Personal : int 2200 2768 2681 2009 1850 2394 1818 2009 3093 3140 ...
## $ PhD : int 85 82 88 90 92 77 96 90 89 91 ...
## $ Terminal : int 89 88 94 95 98 96 96 95 93 99 ...
## $ S.F.Ratio : num 13.8 18.4 13.7 19.5 13.5 18.1 16.1 17.4 12.8 19.7 ...
## $ perc.alumni : int 20 6 17 19 19 19 11 16 9 11 ...
## $ Expend : int 8944 7618 9657 10474 12529 8992 15934 11878 9275 7837 ...
## $ Grad.Rate : int 73 55 57 77 72 63 66 58 37 65 ...
# --- Logistic regression -----------------------------------------------
# Contingency table: acceptance class by university type.
xtabs(~ Acceptance_70 + Private, data=clg)
## Private
## Acceptance_70 Private Public
## Acceptance <= 70% 3 12
## Acceptance > 70% 10 5
# Simple logistic model: acceptance class predicted by Private alone.
logistic_simple <- glm(Acceptance_70 ~ Private, data=clg, family="binomial")
summary(logistic_simple)
##
## Call:
## glm(formula = Acceptance_70 ~ Private, family = "binomial", data = clg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.71251 -0.83463 -0.05513 0.72438 1.56447
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2040 0.6583 1.829 0.0674 .
## PrivatePublic -2.0794 0.8466 -2.456 0.0140 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41.589 on 29 degrees of freedom
## Residual deviance: 34.642 on 28 degrees of freedom
## AIC: 38.642
##
## Number of Fisher Scoring iterations: 4
# Fitted probabilities by university type; with a single binary predictor
# there are only two distinct fitted values (one per type).
predicted.data <- data.frame(probability.of.acpt=logistic_simple$fitted.values,UniType=clg$Private)
xtabs(~ probability.of.acpt + UniType, data=predicted.data)
## UniType
## probability.of.acpt Private Public
## 0.294117647059127 0 17
## 0.769230769230409 13 0
# Full logistic model: acceptance class on type plus enrollment, selectivity,
# faculty and outcome variables. NOTE(review): 10 predictors with n = 30 is
# heavily over-parameterized; coefficients are unstable (7 Fisher iterations).
logistic <- glm(Acceptance_70 ~ Private+Enroll+Top10perc+Top25perc+F.Undergrad+P.Undergrad+Terminal+S.F.Ratio+PhD+Grad.Rate, data=clg, family="binomial")
summary(logistic)
##
## Call:
## glm(formula = Acceptance_70 ~ Private + Enroll + Top10perc +
## Top25perc + F.Undergrad + P.Undergrad + Terminal + S.F.Ratio +
## PhD + Grad.Rate, family = "binomial", data = clg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.46267 -0.57981 0.00524 0.34716 2.33268
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 19.5120484 9.0770348 2.150 0.0316 *
## PrivatePublic -3.4850311 2.5468530 -1.368 0.1712
## Enroll -0.0006646 0.0023798 -0.279 0.7800
## Top10perc -0.1584332 0.1903560 -0.832 0.4052
## Top25perc 0.0064126 0.1530075 0.042 0.9666
## F.Undergrad 0.0003174 0.0004761 0.667 0.5051
## P.Undergrad 0.0007401 0.0005138 1.440 0.1498
## Terminal -0.1619397 0.2116517 -0.765 0.4442
## S.F.Ratio -0.5750719 0.3089933 -1.861 0.0627 .
## PhD -0.0389866 0.2206205 -0.177 0.8597
## Grad.Rate 0.1590752 0.1380443 1.152 0.2492
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41.589 on 29 degrees of freedom
## Residual deviance: 19.816 on 19 degrees of freedom
## AIC: 41.816
##
## Number of Fisher Scoring iterations: 7
# Predicted probabilities from the full model, ranked low-to-high for the
# classification plot, paired with the observed acceptance class.
predicted.data <- data.frame(probability.of.acpt=logistic$fitted.values,acpt=clg$Acceptance_70)
predicted.data <- predicted.data[order(predicted.data$probability.of.acpt, decreasing=FALSE),]
predicted.data$rank <- seq_len(nrow(predicted.data))
## Plot each school's predicted probability of acceptance > 70%, coloured by
## the observed class. (The original comment and y-axis label said "heart
## disease" -- a copy/paste leftover from the tutorial this was adapted from.)
ggplot(data=predicted.data, aes(x=rank, y=probability.of.acpt)) +
geom_point(aes(color=acpt), alpha=1, shape=4, stroke=2) +
xlab("Index") +
ylab("Predicted probability of acceptance > 70%")
# ROC curve for the full logistic model; AUC printed on the plot.
# ("Postive" typo in the original axis labels corrected.)
roc(clg$Acceptance_70,logistic$fitted.values,plot=TRUE, legacy.axes=TRUE, xlab="False Positive Percentage", ylab="True Positive Percentage", col="#377eb8", lwd=4, percent=TRUE, print.auc=TRUE)
## Setting levels: control = Acceptance <= 70%, case = Acceptance > 70%
## Setting direction: controls < cases
##
## Call:
## roc.default(response = clg$Acceptance_70, predictor = logistic$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE)
##
## Data: logistic$fitted.values in 15 controls (clg$Acceptance_70 Acceptance <= 70%) < 15 cases (clg$Acceptance_70 Acceptance > 70%).
## Area under the curve: 92.89%
# ROC curve for the Private-only model, for comparison.
roc(clg$Acceptance_70, logistic_simple$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Positive Percentage", col="#377eb8", lwd=4, print.auc=TRUE)
## Setting levels: control = Acceptance <= 70%, case = Acceptance > 70%
## Setting direction: controls < cases
##
## Call:
## roc.default(response = clg$Acceptance_70, predictor = logistic_simple$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE)
##
## Data: logistic_simple$fitted.values in 15 controls (clg$Acceptance_70 Acceptance <= 70%) < 15 cases (clg$Acceptance_70 Acceptance > 70%).
## Area under the curve: 73.33%
# Overlay the full model's ROC (green) on the simple model's plot (blue).
plot.roc(clg$Acceptance_70, logistic$fitted.values, percent=TRUE, col="#4daf4a", lwd=4, print.auc=TRUE, add=TRUE, print.auc.y=40)
## Setting levels: control = Acceptance <= 70%, case = Acceptance > 70%
## Setting direction: controls < cases
legend("bottomright", legend=c("Simple", "Non Simple"), col=c("#377eb8", "#4daf4a"), lwd=4)
# --- Linear discriminant analysis: Private vs Public -------------------
# Re-read the raw data so the LDA works from the original numeric columns.
clg1 <- read.csv("/Users/ajayvishnu/Desktop/RUTGERS/Spring_2023/Multivariate Analysis/Datasets/College_Data_cleaned.csv", row.names = 1)
attach(clg1)
## The following objects are masked from clg:
##
## Accept, Acceptance_70, Apps, Books, Enroll, Expend, F.Undergrad,
## Grad.Rate, Outstate, P.Undergrad, perc.alumni, Personal, PhD,
## Private, Room.Board, S.F.Ratio, Terminal, Top10perc, Top25perc
# Recode the 0/1 Private flag to labels; ifelse() replaces the original
# chained subset-assignment (which coerced the whole column implicitly).
clg1$Private <- ifelse(test=clg1$Private == 0, yes="Public", no="Private")
# Numeric feature matrix (columns 2-19) plus a numeric response:
# Private = 0, Public = 1 (alphabetical factor order, minus one).
clg1.data <- as.matrix(clg1[,c(2:19)])
clg1_raw <- cbind(clg1.data, as.numeric(as.factor(clg1$Private))-1)
colnames(clg1_raw)[19] <- "UniType"
# 75/25 train/test split. A seed is set so the split is reproducible
# (the original called sample() without one).
set.seed(123)
smp_size_raw <- floor(0.75 * nrow(clg1_raw))
train_ind_raw <- sample(nrow(clg1_raw), size = smp_size_raw)
train_raw.df <- as.data.frame(clg1_raw[train_ind_raw, ])
test_raw.df <- as.data.frame(clg1_raw[-train_ind_raw, ])
# Fit LDA on the training set. Bug fix: the original formula
# `train_raw.df$UniType ~ .` did not mark the UniType column as used, so
# `.` expanded to ALL columns including UniType -- the response leaked into
# the predictors. `UniType ~ .` correctly excludes it.
clg1_raw.lda <- lda(UniType ~ ., data = train_raw.df)
plot(clg1_raw.lda, dimen = 1, type = "b")
# Score the held-out schools; build an ROC curve from the posterior
# probability of the second class (UniType = 1, i.e. Public).
clg1_raw.lda.predict <- predict(clg1_raw.lda, newdata = test_raw.df)
clg1_raw.lda.predict.posteriors <- as.data.frame(clg1_raw.lda.predict$posterior)
pred <- prediction(clg1_raw.lda.predict.posteriors[,2], test_raw.df$UniType)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
plot(roc.perf)
abline(a=0, b= 1)
# Annotate the plot with the test-set AUC.
text(x = .25, y = .65 ,paste0("AUC = ", round(auc.train[[1]],3)))